/* memcmp - compare memory

   Copyright (C) 2013-2019 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include "port_aarch64.h"

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 */

/* Parameters and result.  The two buffer pointers and the byte count are
   taken from x0-x2.  The result is returned in w0, which is the 32-bit
   view of x0 (src1), so result is only written immediately before a
   ret.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		w0

/* Internal variables.  data1/data1h hold the first and second 8-byte
   words of the chunk most recently loaded from src1, data2/data2h the
   same for src2.  data1w/data2w are the 32-bit views of data1/data2 used
   by the 4-byte and single-byte paths.  tmp1 holds the low four bits of
   src1 while the pointers are being aligned for the 64-byte loop.  */
#define data1		x3
#define data1w		w3
#define data1h		x4
#define data2		x5
#define data2w		w5
#define data2h		x6
#define tmp1		x7

/* memcmp: compare the first LIMIT bytes of the buffers at SRC1 and SRC2.

   In:   src1 (x0), src2 (x1) = buffers, limit (x2) = byte count.
   Out:  result (w0) = 0 when all bytes match.  Otherwise it is negative
	 when the first differing byte of SRC1 is the smaller one
	 (unsigned) and positive when it is the larger one.  The word
	 based paths return -1 or 1; the byte loop returns the byte
	 difference.
   Uses: x0-x2 are modified; x3-x7 and the condition flags are
	 overwritten.  The stack is not touched.

   Size dispatch, as implemented by the branches below:
     0-3	byte loop
     4-7	two 4-byte loads per buffer (first and last, may overlap)
     8-15	two 8-byte loads per buffer (first and last, may overlap)
     16		one 16-byte load per buffer
     17-32	first 16 bytes, then the last 16 bytes (may overlap)
     33-143	first 16 bytes, 16-byte loop, then the last 16 bytes
     144+	first 16 bytes, src1 rounded down to 16, 64-byte loop,
		16-byte loop while at least 16 bytes remain, then the
		last 16 bytes.

   ENTRY_ALIGN, DELOUSE, L and END are macros that are not defined in
   this file (presumably they come from port_aarch64.h).
   NOTE(review): in glibc DELOUSE (n) sanitizes argument register n for
   ILP32 and ENTRY_ALIGN (name, 6) aligns the entry point to 2^6 bytes;
   confirm that this port defines them the same way.  */
ENTRY_ALIGN (memcmp, 6)
	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

	/* limit = n - 16.  Fewer than 16 bytes: use the short paths.  */
	subs	limit, limit, 16
	b.lo	L(less16)

	/* Load the first 16 bytes of each buffer and advance both pointers.
	   The first ccmp tests the Z flag left by the subs above (loads do
	   not change the flags).  When n == 16 it forces NZCV = 0, the
	   second ccmp then does the same, and the branch is always taken so
	   that L(return64) computes the final result, equal or not.  When
	   n > 16 the two ccmp form a plain 16-byte inequality test.  */
	ldp	data1, data1h, [src1], 16
	ldp	data2, data2h, [src2], 16
	ccmp	data1, data2, 0, ne
	ccmp	data1h, data2h, 0, eq
	b.ne	L(return64)

	/* limit = n - 32.  For n <= 32 the remaining 1-16 bytes are covered
	   by the final 16-byte load, which may overlap the first one.  */
	subs    limit, limit, 16
	b.ls    L(last_bytes)
	/* Only use the 64-byte loop when limit >= 112, i.e. n >= 144.  */
	cmp     limit, 112
	b.lo    L(loop16)

	/* Round src1 down to a 16-byte boundary.  Both pointers move back
	   by the same tmp1 (0-15) bytes, all of which were already found
	   equal, and limit grows by tmp1 to compensate.  */
	and	tmp1, src1, 15
	add	limit, limit, tmp1
	sub	src1, src1, tmp1
	sub	src2, src2, tmp1
	/* Bias limit so that it is non-negative exactly while at least 64
	   bytes remain to be loaded.  The flags set by this subs are not
	   consumed: the first cmp in the loop overwrites them.  */
	subs 	limit, limit, 48

	/* Compare 64 bytes per iteration as four 16-byte pairs.  src1 is
	   16-byte aligned here; src2 may be unaligned.  Any mismatch leaves
	   the offending pair in data1/data1h and data2/data2h for
	   L(return64).  */
	.p2align 4
L(loop64):
	ldp	data1, data1h, [src1]
	ldp	data2, data2h, [src2]
	cmp     data1, data2
	ccmp	data1h, data2h, 0, eq
	b.ne	L(return64)

	ldp	data1, data1h, [src1,16]
	ldp	data2, data2h, [src2,16]
	cmp     data1, data2
	ccmp	data1h, data2h, 0, eq
	b.ne	L(return64)

	ldp	data1, data1h, [src1,32]
	ldp	data2, data2h, [src2,32]
	cmp     data1, data2
	ccmp	data1h, data2h, 0, eq
	b.ne	L(return64)

	ldp	data1, data1h, [src1,48]
	ldp	data2, data2h, [src2,48]
	cmp     data1, data2
	ccmp	data1h, data2h, 0, eq
	b.ne	L(return64)

	/* The two add instructions do not change the flags, so b.pl tests
	   the result of the subs: iterate again while limit is still
	   non-negative, i.e. while at least 64 more bytes remain.
	   NOTE(review): b.pl is a signed test whereas the other length
	   tests in this function are unsigned; a count with bit 63 set
	   would leave this loop after one iteration.  Presumably such
	   counts never occur for valid objects; confirm.  */
	subs    limit, limit, 64
	add	src1, src1, 64
	add	src2, src2, 64
	b.pl    L(loop64)
	/* Remove the bias: limit = remaining bytes - 16.  No carry out means
	   the sum is still negative, so fewer than 16 bytes remain and the
	   final overlapping load finishes the job.  */
	adds    limit, limit, 48
	b.lo	L(last_bytes)

	/* Compare 16 bytes per iteration.  On entry limit = remaining bytes
	   - 16 and at least 16 bytes remain.  */
L(loop16):
	ldp     data1, data1h, [src1], 16
	ldp     data2, data2h, [src2], 16
	cmp     data1, data2
	ccmp	data1h, data2h, 0, eq
	b.ne	L(return64)

	/* Loop while the updated limit is above zero, i.e. while more than
	   16 bytes are still left.  */
	subs    limit, limit, 16
	b.hi    L(loop16)
	/* Compare last 1-16 bytes using unaligned access.  limit is in
	   [-16, 0] here, so both pointers step back and the 16-byte loads
	   end exactly at the end of the buffers; bytes that are read a
	   second time are already known to be equal.  */
L(last_bytes):
	add	src1, src1, limit
	add	src2, src2, limit
	ldp	data1, data1h, [src1]
	ldp	data2, data2h, [src2]

	/* Compare data bytes and set return value to 0, -1 or 1.
	   data1/data2 hold the first 8 bytes and data1h/data2h the second
	   8 bytes of the chunk being examined.  If the first words differ
	   they decide the result; otherwise the second words do, and these
	   may also be equal.  */
L(return64):
    	cmp	data1, data2
	bne	L(return)
	/* L(return_pre) is not branched to anywhere in the visible code.  */
L(return_pre):
	mov	data1, data1h
	mov	data2, data2h
	/* Order one pair of words.  On little-endian the words are byte
	   reversed first so that the byte from the lowest address becomes
	   the most significant one; an unsigned compare then yields the
	   ordering of the first differing byte.  */
L(return):
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	cmp     data1, data2
	/* result = 0 if equal, else 1, negated to -1 when data1 is lower
	   (unsigned).  L(ret_eq) is not branched to anywhere in the visible
	   code.  */
L(ret_eq):
	cset	result, ne
	cneg	result, result, lo
	ret

	/* Fewer than 16 bytes.  On entry limit = n - 16, in [-16, -1].  */
	.p2align 4
L(less16):
	/* limit = n - 8.  No carry out means n < 8.  */
	adds	limit, limit, 8
	b.lo	L(less8)
	ldr	data1, [src1]
	ldr	data2, [src2]
	/* n is 8-15.  Z is still the one set by the adds above.  When n == 8
	   the ccmp forces NZCV = 0 so the branch is always taken and
	   L(return) resolves the single word; otherwise the ccmp compares
	   the first 8 bytes.  */
    ccmp    data1, data2, 0, ne
	b.ne	L(return)

	/* The first 8 bytes match: compare the last 8 bytes, which start at
	   offset n - 8 and overlap the first word.  */
	ldr     data1, [src1, limit]
	ldr	data2, [src2, limit]
	b	L(return)

	/* Fewer than 8 bytes.  On entry limit = n - 8, in [-8, -1].  */
	.p2align 4
L(less8):
	/* limit = n - 4.  No carry out means n < 4.  */
	adds	limit, limit, 4
	b.lo	L(less4)
	/* n is 4-7: same scheme as above with 4-byte loads.  Loading into
	   the W registers zeroes the upper 32 bits, so the 64-bit ccmp and
	   the 64-bit compare in L(return) only see the loaded bytes.  */
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ccmp    data1, data2, 0, ne
	b.ne	L(return)
	ldr     data1w,	[src1, limit]
	ldr     data2w,	[src2, limit]
	b	L(return)
	/* Fewer than 4 bytes.  On entry limit = n - 4, in [-4, -1].  */
	.p2align 4
L(less4):
	/* limit = n, in 0-3.  Nothing to compare when it is zero.  */
	adds	limit, limit, 4
	beq	L(ret_0)

	/* Compare one byte at a time.  While bytes remain after this one
	   the ccmp compares the pair just loaded; once limit reaches zero
	   it forces NZCV = 0, which ends the loop.  The result is the
	   difference of the last pair loaded, zero if they matched.  */
L(byte_loop):
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	subs	limit, limit, 1
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
	sub	result, data1w, data2w
	ret
	/* n == 0: the buffers compare equal.  */
L(ret_0):
    	mov     result, 0
    	ret

END (memcmp)
